Load required libraries.
library(ggplot2)
library(dplyr)
library(tidyr)
library(purrr)
library(grid)
library(wordbankr)
library(langcog)

# Use the standard mikabr ggplot2 theme for all plots in this document.
theme_set(theme_mikabr())

Load in Wordbank data.
# Item-level metadata for all Wordbank instruments, restricted to word items.
# num_item_id strips the "item_" prefix from the "item_NNN" identifier and
# converts the remainder to a number.
items <- get_item_data() %>%
  filter(type == "word") %>%
  mutate(num_item_id = as.numeric(substr(item_id, 6, nchar(item_id))))

Get vocabulary composition data for all languages.
# Get vocabulary composition data for one instrument (language/form pair).
#
# Returns one row per (child, measure, lexical category) with:
#   prop  -- proportion of that category's items the child knows
#   vocab -- the child's overall vocabulary size as a proportion of all items
get_vocab_comp <- function(input_language, input_form) {
  # Items on this instrument in the three lexical categories of interest.
  lang_vocab_items <- items %>%
    filter(language == input_language, form == input_form,
           lexical_category %in% c("nouns", "predicates", "function_words"))

  # Child-by-item responses, recoded into two logical measures; production
  # implies comprehension, so "understands" includes "produces".
  lang_vocab_data <- get_instrument_data(instrument_language = input_language,
                                         instrument_form = input_form,
                                         items = lang_vocab_items$item_id,
                                         iteminfo = lang_vocab_items) %>%
    mutate(value = ifelse(is.na(value), "", value),
           produces = value == "produces",
           understands = value == "produces" | value == "understands") %>%
    select(-value) %>%
    gather(measure, value, produces, understands)

  num_words <- nrow(lang_vocab_items)

  # Per-child, per-measure, per-category counts and proportions.
  lang_vocab_summary <- lang_vocab_data %>%
    group_by(data_id, measure, lexical_category) %>%
    summarise(num_true = sum(value),
              prop = sum(value) / n())

  # Per-child, per-measure overall vocabulary size (summarise drops the
  # innermost group, so this aggregates over lexical_category).
  lang_vocab_sizes <- lang_vocab_summary %>%
    summarise(vocab = sum(num_true) / num_words)

  lang_vocab_summary %>%
    left_join(lang_vocab_sizes, by = c("data_id", "measure")) %>%
    select(-num_true) %>%
    mutate(language = input_language, form = input_form)
}

# All instruments (language/form pairs) present in the item data.
instruments <- items %>%
  select(language, form) %>%
  distinct()

# Vocabulary composition for every instrument, stacked into one data frame.
vocab_comp_data <- map2(instruments$language, instruments$form, get_vocab_comp) %>%
  bind_rows()

Show sample size of each instrument.
# Number of children (administrations) per instrument; the per-measure,
# per-category counts are identical within an instrument, so distinct()
# collapses them to one row per language/form.
sample_sizes <- vocab_comp_data %>%
  group_by(language, form, measure, lexical_category) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  select(language, form, n) %>%
  distinct()
kable(sample_sizes)
| language | form | n |
|---|---|---|
| British Sign Language | WG | 161 |
| Cantonese | WS | 987 |
| Croatian | WG | 250 |
| Croatian | WS | 377 |
| Danish | WS | 3714 |
| English | WG | 2454 |
| English | WS | 5824 |
| German | WS | 1183 |
| Hebrew | WG | 62 |
| Hebrew | WS | 253 |
| Italian | WG | 648 |
| Italian | WS | 752 |
| Mandarin | TC | 652 |
| Mandarin | WS | 1056 |
| Norwegian | WG | 3025 |
| Norwegian | WS | 12969 |
| Russian | WG | 768 |
| Russian | WS | 1037 |
| Spanish | WG | 778 |
| Spanish | WS | 1094 |
| Swedish | WG | 474 |
| Swedish | WS | 900 |
| Turkish | WG | 1115 |
| Turkish | WS | 2422 |
Base plot for looking at vocabulary composition.
# Base plot for looking at vocabulary composition: proportion of each lexical
# category known, as a function of overall vocabulary size, faceted by
# language. Hebrew is excluded (presumably due to small sample size -- see
# the sample-size table; TODO confirm).
base_plot <- function(input_form, input_measure) {
  vocab_comp_data %>%
    filter(form == input_form, measure == input_measure, language != "Hebrew") %>%
    mutate(lexical_category = factor(lexical_category,
                                     levels = c("nouns", "predicates", "function_words"),
                                     labels = c("Nouns", "Predicates", "Function Words"))) %>%
    ggplot(aes(x = vocab, y = prop, colour = lexical_category)) +
    facet_wrap(~language, ncol = 3) +
    # Diagonal reference line: a category tracking overall vocabulary exactly.
    geom_abline(slope = 1, intercept = 0, color = "gray", linetype = "dashed") +
    scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, 0.2),
                       name = "Proportion of Category\n") +
    scale_x_continuous(limits = c(0, 1), breaks = seq(0, 1, 0.2),
                       name = "\nVocabulary Size") +
    scale_color_solarized(name = "") + # "Lexical Category") +
    theme(legend.position = c(0.068, 0.95),
          #legend.text = element_text(size = 9),
          legend.title = element_text(lineheight = unit(0.8, "char")), #size = 9),
          legend.key.height = unit(0.8, "char"),
          #legend.key.width = unit(0.3, "cm"),
          legend.key = element_blank(),
          legend.background = element_rect(fill = "transparent"))
}

Plot WS productive vocabulary composition as a function of vocabulary size for each language.
base_plot("WS", "produces") + geom_jitter(size = 0.7)

Plot WG productive vocabulary composition as a function of vocabulary size for each language.
base_plot("WG", "produces") + geom_jitter(size = 0.7)

Plot WG receptive vocabulary composition as a function of vocabulary size for each language.
base_plot("WG", "understands") + geom_jitter(size = 0.7)

Plot WS productive vocabulary composition as a function of vocabulary size for each language with cubic constrained lm curves.
base_plot("WS", "produces") +
  geom_smooth(method = "clm", formula = y ~ I(x ^ 3) + I(x ^ 2) + x - 1)

Plot WG productive vocabulary composition as a function of vocabulary size for each language with cubic constrained lm curves.
base_plot("WG", "produces") +
  geom_smooth(method = "clm", formula = y ~ I(x ^ 3) + I(x ^ 2) + x - 1)

Plot WG receptive vocabulary composition as a function of vocabulary size for each language with cubic constrained lm curves.
base_plot("WG", "understands") +
  geom_smooth(method = "clm", formula = y ~ I(x ^ 3) + I(x ^ 2) + x - 1)

# Combined data + model plot for presentation use.
base_plot("WS", "produces") +
  geom_jitter(size = 0.7, alpha = 0.5) +
  geom_smooth(method = "clm", formula = y ~ I(x ^ 3) + I(x ^ 2) + x - 1, size = 1) +
  theme_mikabr(base_size = 20) +
  theme(legend.position = "top")
#ggsave("BUCLD/data_models.png", width = 15, height = 17, dpi = 300)

# Number of WS administrations per language.
# NOTE(review): distinct(data_id) alone drops the language column under
# current dplyr, breaking the subsequent group_by(language); keep both
# columns so each (language, child) pair is counted once.
num_admins <- vocab_comp_data %>%
  filter(form == "WS") %>%
  distinct(language, data_id) %>%
  group_by(language) %>%
  summarise(N = n())
# Number of items per lexical category on each language's WS form, one
# column per category.
num_items <- items %>%
  filter(form == "WS",
         lexical_category %in% c("nouns", "predicates", "function_words")) %>%
  group_by(language, lexical_category) %>%
  summarise(n = n()) %>%
  spread(lexical_category, n)

# Combined table: administrations (N) and item counts per language.
sample_sizes <- num_admins %>%
  left_join(num_items, by = "language") %>%
  select(language, N, nouns, predicates, function_words) %>%
  rename(Language = language, Nouns = nouns, Predicates = predicates,
         `Function Words` = function_words)
kable(sample_sizes)
| Language | N | Nouns | Predicates | Function Words |
|---|---|---|---|---|
| Cantonese | 987 | 316 | 256 | 108 |
| Croatian | 377 | 312 | 166 | 139 |
| Danish | 3714 | 316 | 166 | 128 |
| English | 5824 | 312 | 166 | 102 |
| German | 1183 | 270 | 154 | 93 |
| Hebrew | 253 | 322 | 151 | 49 |
| Italian | 752 | 312 | 166 | 92 |
| Mandarin | 1056 | 322 | 260 | 113 |
| Norwegian | 12969 | 316 | 170 | 132 |
| Russian | 1037 | 314 | 182 | 102 |
| Spanish | 1094 | 312 | 166 | 102 |
| Swedish | 900 | 339 | 167 | 97 |
| Turkish | 2422 | 297 | 207 | 84 |
#png("BUCLD/sample_sizes.png", width = 1080/72, height = 540/72, res = 300, units = "in")
#textplot(sample_sizes, show.rownames = FALSE, mar = c(0, 0, 0, 0), cmar = 4)
#dev.off()
Function for resampling data.
# Function for resampling data.
#
# For each (language, form, measure, lexical category), repeatedly resample
# children with replacement and compute the signed area between the fitted
# constrained cubic and the diagonal y = x over [0, 1]. Returns a long data
# frame with one row per group per bootstrap sample.
sample_areas <- function(d, nboot = 1000) {
  # Signed area between the fitted curve b3 x^3 + b2 x^2 + b1 x and the
  # diagonal: integral over [0, 1] is b3/4 + b2/3 + b1/2, minus 1/2.
  poly_area <- function(group_data) {
    model <- clm(prop ~ I(vocab ^ 3) + I(vocab ^ 2) + vocab - 1,
                 data = group_data)
    (model$solution %*% c(1 / 4, 1 / 3, 1 / 2) - 0.5)[1]
  }

  counter <- 1
  # One bootstrap replicate: resample children within each instrument and
  # measure, then compute the area per lexical category. The area column is
  # named after the current sample number so successive replicates can be
  # joined side by side on the grouping columns.
  sample_area <- function(d) {
    d_frame <- d %>%
      group_by(language, form, measure) %>%
      sample_frac(replace = TRUE) %>%
      group_by(language, form, measure, lexical_category) %>%
      do(area = poly_area(.)) %>%
      mutate(area = area[1])
    # Base rename instead of the deprecated rename_(.dots = ...) idiom.
    names(d_frame)[names(d_frame) == "area"] <- as.character(counter)
    counter <<- counter + 1  # increment counter in the enclosing scope
    d_frame
  }

  areas <- replicate(nboot, sample_area(d), simplify = FALSE)
  # Join every replicate on the grouping columns, then gather the numbered
  # area columns into long (sample, area) form.
  Reduce(left_join, areas) %>%
    gather(sample, area, -language, -form, -measure, -lexical_category)
}

Resample data, compute area for each sample, find the mean and CI of the area estimate.
# Bootstrap area estimates, then mean and CI per instrument / measure /
# lexical category.
areas <- sample_areas(vocab_comp_data, 1000)
area_summary <- areas %>%
  group_by(language, form, measure, lexical_category) %>%
  summarise(mean = mean(area),
            ci_lower = ci_lower(area),
            ci_upper = ci_upper(area)) %>%
  ungroup() %>%
  mutate(language = factor(language),
         instrument = paste(language, form))

# Order languages by their mean noun area on WS production so plots are
# sorted consistently across panels.
area_order <- filter(area_summary, form == "WS", measure == "produces",
                     lexical_category == "nouns")
language_levels <- area_order$language[order(area_order$mean,
                                             area_order$language,
                                             decreasing = FALSE)]

# Keep WS production and both WG measures; relevel for plotting.
area_summary_ordered <- area_summary %>%
  filter(form %in% c("WS", "WG"),
         !(form == "WS" & measure == "understands")) %>%
  ungroup() %>%
  mutate(language = factor(language, levels = language_levels),
         lexical_category = factor(lexical_category,
                                   levels = c("nouns", "predicates", "function_words"),
                                   labels = c("Nouns", "Predicates", "Function Words")))

Plot each lexical category's area estimate by language, form, and measure.
# Area estimates (point = mean, segment = CI) by language, faceted by
# lexical category and form/measure.
ggplot(area_summary_ordered,
       aes(y = language, x = mean, colour = lexical_category)) +
  facet_grid(lexical_category ~ form + measure) +
  geom_point() +
  geom_segment(aes(x = ci_lower, xend = ci_upper,
                   y = language, yend = language)) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "gray") +
  scale_colour_solarized(name = "", guide = FALSE) +
  scale_y_discrete(name = "", limits = levels(area_summary_ordered$language)) +
  xlab("\nRelative representation in early vocabulary")

Plot each lexical category's area estimate by language and measure for WS only.
# Same area-estimate plot, restricted to WS, one facet per lexical category.
ggplot(filter(area_summary_ordered, form == "WS"),
       aes(y = language, x = mean, col = lexical_category)) +
  facet_grid(. ~ lexical_category) +
  geom_point() +
  geom_segment(aes(x = ci_lower, xend = ci_upper,
                   y = language, yend = language)) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "gray") +
  scale_colour_solarized(name = "", guide = FALSE) +
  scale_y_discrete(name = "", limits = levels(area_summary_ordered$language)) +
  xlab("\nRelative representation in early vocabulary") +
  theme_mikabr(base_size = 20)
#ggsave("BUCLD/diffs.png", width = 1080/72, height = 540/72)

Demo plots of English and Mandarin data and models.
# Languages used for the side-by-side data vs. model demo panels.
demo_langs <- c("English", "Mandarin")

demo_data <- filter(vocab_comp_data, form == "WS", language %in% demo_langs) %>%
  mutate(panel = paste(language, "(data)"),
         lexical_category = factor(lexical_category,
                                   levels = c("nouns", "predicates", "function_words"),
                                   labels = c("Nouns", "Predicates", "Function Words")))

# Fit the constrained cubic to each language/category; predict on a fine grid.
pts <- seq(0, 1, 0.01)
models <- demo_data %>%
  group_by(language, lexical_category) %>%
  do(model = clm(prop ~ I(vocab ^ 3) + I(vocab ^ 2) + vocab - 1, data = .))

# Predicted proportions over the vocabulary grid for one language/category.
get_lang_lexcat_predictions <- function(lang, lexcat) {
  model <- filter(models, language == lang, lexical_category == lexcat)$model[[1]]
  data.frame(vocab = pts,
             prop = predict(model, newdata = data.frame(vocab = pts)),
             lexical_category = lexcat,
             language = lang)
}

# Predictions for all lexical categories of one language.
get_lang_predictions <- function(lang) {
  bind_rows(lapply(unique(demo_data$lexical_category),
                   function(lexcat) get_lang_lexcat_predictions(lang, lexcat)))
}

predictions <- bind_rows(lapply(demo_langs, get_lang_predictions))

# The diagonal y = x traced in reverse order so that, appended to each
# prediction curve, the points close into a polygon between curve and
# diagonal (shaded by geom_polygon below).
diagonal <- expand.grid(vocab = rep(rev(pts)),
                        language = demo_langs,
                        lexical_category = unique(demo_data$lexical_category))
diagonal$prop <- diagonal$vocab
area_poly <- bind_rows(predictions, diagonal) %>%
  mutate(panel = paste(language, "(models)"))

ggplot(demo_data,
       aes(x = vocab, y = prop, colour = lexical_category, fill = lexical_category)) +
  facet_grid(~ panel) +
  geom_point(size = 0.7) +
  geom_polygon(data = area_poly, alpha = 0.2) +
  geom_abline(slope = 1, intercept = 0, color = "gray", linetype = "dashed") +
  scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, 0.2),
                     name = "Proportion of Category\n") +
  scale_x_continuous(limits = c(0, 1), breaks = seq(0, 1, 0.2),
                     name = "\nVocabulary Size") +
  scale_color_solarized(guide = FALSE) +
  scale_fill_solarized(name = "") +
  theme(legend.position = c(0.061, 0.91),
        legend.text = element_text(size = 8),
        legend.key.height = unit(0.9, "char"),
        legend.key.width = unit(0.88, "char"),
        legend.background = element_rect(fill = "transparent"))